if (!require("BiocManager", quietly = TRUE))
install.packages("BiocManager")
if (!require("GEOquery", quietly = TRUE))
BiocManager::install("GEOquery")
if (!require("rScudo", quietly = TRUE))
BiocManager::install("rScudo")
library(tidyverse)
library(dplyr)
library(GEOquery)
library(rScudo)
library(plotly)
The analysis will use the dataset GSE20437 obtained from GEO.The dataset is generated from Affymetrix HU133A microarrays and contains 42 tissue samples.
In detail, the data includes:
18 reduction mammoplasty (RM) breast epithelium samples,
18 histologically normal (HN) epithelial samples from breast cancer patients (9 ER+ and 9 ER-), and
6 histologically normal epithelial samples from prophylactic mastectomy patients.
Note that sample numbers correspond to individual patient samples.
# download the GSE20437 expression data series
gse <- getGEO("GSE20437", destdir= './data/', getGPL = F)
## Found 1 file(s)
## GSE20437_series_matrix.txt.gz
## Using locally cached version: ./data//GSE20437_series_matrix.txt.gz
# getGEO returns a list of expression objects, but...
length(gse)
## [1] 1
# shows us there is only one object in it.
# We assign it to the same variable.
gse <- gse[[1]]
# show what we have:
show(gse)
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 22283 features, 42 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: GSM512539 GSM512540 ... GSM512580 (42 total)
## varLabels: title geo_accession ... tissue:ch1 (38 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 20197764
## Annotation: GPL96
The actual expression data are accessible in the exprs
section of gse, an Expression Set and the generic data
class that BioConductor uses for expression data.
head(exprs(gse))
## GSM512539 GSM512540 GSM512541 GSM512542 GSM512543 GSM512544 GSM512545
## 1007_s_at 2461.4 3435.7 1932.5 2377.7 3055.3 2978.1 2348.5
## 1053_at 26.7 159.0 31.2 140.7 69.9 98.5 37.0
## 117_at 82.6 243.4 150.2 95.1 209.3 103.4 91.2
## 121_at 942.3 897.5 840.8 870.9 685.4 791.8 886.5
## 1255_g_at 71.8 87.9 75.4 58.1 31.8 40.3 70.5
## 1294_at 630.2 571.4 346.3 679.9 1289.3 421.1 417.6
## GSM512546 GSM512547 GSM512548 GSM512549 GSM512550 GSM512551 GSM512552
## 1007_s_at 2963.9 2776.9 3088.9 3033.3 3037.1 3545.8 3322.6
## 1053_at 59.9 86.7 107.2 64.0 82.9 97.7 69.7
## 117_at 168.4 162.7 203.2 143.7 113.5 80.0 186.4
## 121_at 954.2 843.1 775.3 847.6 912.2 911.6 862.4
## 1255_g_at 43.3 51.6 42.6 74.9 53.7 30.5 15.2
## 1294_at 811.6 778.1 393.2 995.4 987.7 938.5 924.6
## GSM512553 GSM512554 GSM512555 GSM512556 GSM512557 GSM512558 GSM512559
## 1007_s_at 1963.7 3609.6 2078.9 4138.6 4260.7 2453.6 2709.0
## 1053_at 82.0 45.6 84.5 31.7 37.4 82.4 204.8
## 117_at 106.6 145.6 144.4 133.6 278.6 173.0 147.8
## 121_at 705.0 984.6 853.8 846.8 1273.0 833.6 908.1
## 1255_g_at 42.5 76.6 88.2 90.6 65.8 25.8 77.5
## 1294_at 480.8 1054.1 632.0 448.0 1345.2 1248.9 405.7
## GSM512560 GSM512561 GSM512562 GSM512563 GSM512564 GSM512565 GSM512566
## 1007_s_at 2612.5 4340.1 3155.3 2390.3 2738.8 3233.1 2836.6
## 1053_at 119.3 76.7 100.3 115.4 14.1 47.6 77.1
## 117_at 186.0 168.0 95.2 73.6 122.7 107.6 120.9
## 121_at 806.2 827.0 629.4 709.2 305.6 877.4 425.7
## 1255_g_at 84.3 87.9 44.6 59.3 12.0 82.1 59.2
## 1294_at 647.5 2218.1 1321.1 606.7 1709.9 980.8 1268.4
## GSM512567 GSM512568 GSM512569 GSM512570 GSM512571 GSM512572 GSM512573
## 1007_s_at 2915.4 3457.5 2798.7 4370.2 2467.3 3669.5 3310.1
## 1053_at 47.1 47.0 83.2 40.2 80.3 24.1 8.8
## 117_at 143.4 92.5 72.1 131.8 156.4 165.8 141.6
## 121_at 643.8 771.3 681.1 812.7 533.4 746.9 1090.3
## 1255_g_at 62.2 28.3 97.6 8.1 17.9 53.0 39.9
## 1294_at 955.8 1157.5 888.6 1130.8 905.1 1138.5 483.0
## GSM512574 GSM512575 GSM512576 GSM512577 GSM512578 GSM512579 GSM512580
## 1007_s_at 3942.2 4520.4 3596.1 2989.0 3164.5 2764.3 4258.5
## 1053_at 44.6 54.7 56.7 89.9 63.4 57.0 59.5
## 117_at 97.1 132.7 124.3 210.5 131.4 89.6 123.3
## 121_at 1008.7 718.6 988.4 295.9 957.3 630.8 869.2
## 1255_g_at 11.0 50.2 60.0 34.3 33.5 61.7 50.4
## 1294_at 1326.5 1179.4 668.3 863.2 1055.5 1287.6 1127.8
To conveniently access the data rows and columns present in
exprs(gse), this matrix is assigned to its own variable
ex.
# exprs (gse) is a matrix that we can assign to its own variable, to
# conveniently access the data rows and columns
ex <- exprs(gse)
dim(ex) # 42 sample, 22283 genes
## [1] 22283 42
The dataset contains gene expression data of 22283 genes (rows) from 42 patients (columns).
# Analyze value distributions
#boxplot(ex)
boxplot(ex, main = 'Boxplot of the data before normalization')
I try to apply a log transformation to the data to obtain a boxplot easier to read.
ex2<-log(ex)
ex2 <- na.omit(as.matrix(ex2))
boxplot(ex2, main = 'Boxplot of the data after applying a logarithmic transformation')
Boxplot of the data after applying a logarithmic transformation
From the boxplot after the log transformation, I can see that there is some variation in the median of the samples. So, I try to apply a median normalization to the data after the log transformation.
# MEDIAN NORMALIZATION
channel.medians=apply(log(ex),2,median)
normalized.log.ex=sweep(log(ex),2,channel.medians,"-")
# boxplot post median normalization on ex
boxplot(normalized.log.ex, main = 'Boxplot of the data after median normalization')
Boxplot of the data after median normalization
pca <- prcomp(t(normalized.log.ex))
summary(pca) # get the summary of the PCA, we get a table in which each column is a principal components and we have information on the standard deviation and the variation, etc
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 38.8020 23.47065 20.63892 19.54327 18.43330 15.97431
## Proportion of Variance 0.1791 0.06552 0.05067 0.04543 0.04042 0.03035
## Cumulative Proportion 0.1791 0.24461 0.29527 0.34070 0.38112 0.41147
## PC7 PC8 PC9 PC10 PC11 PC12
## Standard deviation 15.54787 15.25339 14.95988 13.82648 13.67761 13.63388
## Proportion of Variance 0.02875 0.02767 0.02662 0.02274 0.02225 0.02211
## Cumulative Proportion 0.44023 0.46790 0.49452 0.51726 0.53951 0.56162
## PC13 PC14 PC15 PC16 PC17 PC18
## Standard deviation 13.39954 13.05683 12.96288 12.78929 12.64910 12.48152
## Proportion of Variance 0.02136 0.02028 0.01999 0.01946 0.01903 0.01853
## Cumulative Proportion 0.58298 0.60326 0.62324 0.64270 0.66173 0.68026
## PC19 PC20 PC21 PC22 PC23 PC24
## Standard deviation 12.32645 12.13869 12.04722 11.97088 11.90374 11.69483
## Proportion of Variance 0.01807 0.01753 0.01726 0.01705 0.01685 0.01627
## Cumulative Proportion 0.69833 0.71586 0.73312 0.75017 0.76702 0.78329
## PC25 PC26 PC27 PC28 PC29 PC30
## Standard deviation 11.60734 11.57222 11.46507 11.15870 11.04915 10.99769
## Proportion of Variance 0.01603 0.01593 0.01564 0.01481 0.01452 0.01439
## Cumulative Proportion 0.79932 0.81525 0.83088 0.84569 0.86021 0.87460
## PC31 PC32 PC33 PC34 PC35 PC36
## Standard deviation 10.68517 10.45191 10.25430 10.13818 9.9583 9.68456
## Proportion of Variance 0.01358 0.01299 0.01251 0.01223 0.0118 0.01116
## Cumulative Proportion 0.88818 0.90117 0.91368 0.92591 0.9377 0.94886
## PC37 PC38 PC39 PC40 PC41 PC42
## Standard deviation 9.50070 9.42259 9.3053 9.17335 8.95347 6.784e-14
## Proportion of Variance 0.01074 0.01056 0.0103 0.01001 0.00954 0.000e+00
## Cumulative Proportion 0.95960 0.97016 0.9805 0.99046 1.00000 1.000e+00
screeplot(pca) # this plot show the variance explained by the first 10 components
Variance explained by the first 10 components
# draw PCA plot
group <- c(rep("cadetblue1",18), rep("red",18), rep("purple",6) ) # vector of colors based on the order of my data
plot(pca$x[,1], pca$x[,2], xlab="PCA1", ylab="PCA2", main="PCA for components 1 and 2", type="p", pch=10, col=group)
text(pca$x[,1], pca$x[,2], rownames(pca$data), cex=0.75)
The vector group used in the PCA plot is based on the data. The samples corresponding to the colors are the following:
Light blue: reduction mammoplasty (RM) breast epithelium samples
Red: histologically normal (HN) epithelial samples from breast cancer patient
Purple: histologically normal breast epithelium (NlEpi) from prophylactic mastectomy patient samples
Let’s try to explore an interactive PCA plot.
components<-pca[["x"]]
components<-data.frame(components)
type<-c(rep("RM", 18), rep("HN",18), rep("NlEpi",6))
components<-cbind(components, type )
fig <- plot_ly(components, x=~PC1, y=~PC2,
color=type,colors=c('cadetblue1', 'red','purple'),
type='scatter',mode='markers')
fig
fig2 <- plot_ly(components, x=~PC1, y=~PC2, z=~PC3,
color=type, colors=c('cadetblue1', 'red','purple'),
mode='markers', marker = list(size = 4))
fig2
fig3 <- plot_ly(components, x=~PC1, y=~PC3,
color=type, colors=c('cadetblue1', 'red','purple'),
type='scatter',mode='markers')
fig3
umap_data <- umap::umap(t(normalized.log.ex),
n_neighbors = sqrt(dim(t(normalized.log.ex))[1]),#or the square root of the rows
min_dist = 0.1,
metric = "euclidean", #you can change it
n_components = 2) #used the default ones!
umap_df <- data.frame(umap_data$layout) %>% tibble::rownames_to_column('Samples')
umap_df$type<-c(rep("RM", 18), rep("HN",18), rep("NlEpi",6))
library(plotly)
figUmap <- plot_ly(umap_df,
x = ~X1, y = ~X2, color = umap_df$type,
colors = c('cadetblue1', 'red','purple'),
mode = 'markers',
size=1)
# Display the 2D scatter plot
figUmap
## No trace type specified:
## Based on info supplied, a 'scatter' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter
umap_data_3D <- umap::umap(t(normalized.log.ex),
n_neighbors = sqrt(dim(t(normalized.log.ex))[1]),
#or the square root of the rows, provato anche con 5 fa pena, anche 15 abbastanza pena
min_dist = 0.1,
metric = "euclidean", #you can change it
n_components = 3) #used the default ones!
umap_df_3D <- data.frame(umap_data_3D$layout) %>% tibble::rownames_to_column('Samples')
umap_df_3D$type<-c(rep("RM", 18), rep("HN",18), rep("NlEpi",6))
figUmap_3D <- plot_ly(umap_df_3D,
x = ~X1, y = ~X2, z=~X3, color = umap_df_3D$type,
colors = c('cadetblue1', 'red','purple'),
mode = 'markers',
size=1) %>% layout(title = 'Umap 1 control-tumors 3D')
# Display the 2D scatter plot
figUmap_3D
## No trace type specified:
## Based on info supplied, a 'scatter3d' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter3d
```